In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import re
/Users/lettyuy/opt/anaconda3/lib/python3.9/site-packages/scipy/__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.0
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
# Load the Hot 100 chart history, parse both date columns as datetimes,
# and derive the calendar year of each chart entry.
df = pd.read_csv("Hot 100.csv").assign(
    chart_date=lambda frame: pd.to_datetime(frame['chart_date']),
    chart_debut=lambda frame: pd.to_datetime(frame['chart_debut']),
    chart_year=lambda frame: frame['chart_date'].dt.year,
)
df.head()
Out[2]:
chart_position chart_date song performer song_id instance time_on_chart consecutive_weeks previous_week peak_position worst_position chart_debut chart_url chart_year
0 84 1990-05-05 "B" Girls Young And Restless "B" GirlsYoung And Restless 1.0 1 NaN NaN 84 84 1990-05-05 https://www.billboard.com/charts/hot-100/1990-... 1990
1 78 1990-05-12 "B" Girls Young And Restless "B" GirlsYoung And Restless 1.0 2 1.0 84.0 78 84 1990-05-05 https://www.billboard.com/charts/hot-100/1990-... 1990
2 68 1990-05-19 "B" Girls Young And Restless "B" GirlsYoung And Restless 1.0 3 2.0 78.0 68 84 1990-05-05 https://www.billboard.com/charts/hot-100/1990-... 1990
3 60 1990-05-26 "B" Girls Young And Restless "B" GirlsYoung And Restless 1.0 4 3.0 68.0 60 84 1990-05-05 https://www.billboard.com/charts/hot-100/1990-... 1990
4 58 1990-06-02 "B" Girls Young And Restless "B" GirlsYoung And Restless 1.0 5 4.0 60.0 58 84 1990-05-05 https://www.billboard.com/charts/hot-100/1990-... 1990
In [3]:
#Create a new column extracting each individual artist from musical collaborations
# NOTE: "," and "&" are treated as separators wherever they occur, so an act whose own
# name contains one of them is split into several "artists" as well.
artist_separators = re.compile(r',|&| and | featuring | feat\. | ft\. ', flags=re.IGNORECASE)

# Leave non-string performers (e.g. missing values) untouched instead of letting the
# regex split raise a TypeError on them.
df['individual_artist'] = df['performer'].apply(
    lambda x: artist_separators.split(x) if isinstance(x, str) else x
)
df = df.explode('individual_artist')
df['individual_artist'] = df['individual_artist'].str.strip()

# Adjacent or trailing separators leave empty fragments behind; drop them so that ''
# is never counted as an artist. explode() repeats the original index label for every
# fragment, so rebuild a unique index afterwards.
df = df[df['individual_artist'] != ''].reset_index(drop=True)
In [4]:
#Calculated the average chart positions for each song + individual artist
avg_chart_positions = (
    df.groupby(['song', 'individual_artist'])['chart_position']
    .mean()
    .round()
    .astype(int)
    .reset_index(name='avg_chart_position')
)

# Drop any avg_chart_position column left behind by an earlier run of this cell.
# Without this, re-running the cell would merge into a frame that already has the column
# and pandas would emit avg_chart_position_x / avg_chart_position_y instead.
df = pd.merge(
    df.drop(columns='avg_chart_position', errors='ignore'),
    avg_chart_positions,
    on=['song', 'individual_artist'],
    how='left',
)

#Filter dataframe for only #1s
df_at_1 = df[df['chart_position'] == 1]

#Made sure songs are unique after filtering for #1 songs only
unique_songs_at_1 = (
    df_at_1.groupby(['individual_artist', 'song'])
    .size()
    .reset_index(name='count')
)
# Number of distinct #1 songs per individual artist.
individual_artist_hits = unique_songs_at_1.groupby('individual_artist').size()

# One row per (song, artist) pair: the first #1 row encountered for that pair.
# Both cohorts below are drawn from this frame.
first_row_at_1 = df_at_1.drop_duplicates(subset=['song', 'individual_artist'])

#Created a list of artists with only one hit and created a temporary dataframe to house the data
one_hit_artists_list = individual_artist_hits[individual_artist_hits == 1].index.tolist()
# .copy() makes the cohort an independent frame, so adding columns to it later does not
# trigger SettingWithCopyWarning.
df_one_hit_wonders = first_row_at_1[
    first_row_at_1['individual_artist'].isin(one_hit_artists_list)
].copy()

#Created a list of artists with greater than or equal to 3 hits and created a temporary dataframe to house the data
artists_with_staying_power_list = individual_artist_hits[individual_artist_hits >= 3].index.tolist()
df_artists_with_staying_power = first_row_at_1[
    first_row_at_1['individual_artist'].isin(artists_with_staying_power_list)
].copy()
In [5]:
#Created a function to get only the top 10 per year in each of our two groups
def get_top_10_per_year(group, n=10, by='consecutive_weeks'):
    """Return the rows of ``group`` with the largest values in column ``by``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to rank; when used with ``groupby(...).apply`` this is one group.
    n : int, default 10
        Maximum number of rows to keep. Fewer rows are returned when ``group``
        has fewer than ``n`` rows.
    by : str, default 'consecutive_weeks'
        Name of the column to rank by, largest first.

    Returns
    -------
    pandas.DataFrame
        The selected rows, ordered from largest to smallest ``by`` value.
    """
    return group.nlargest(n, by)

#Stored the labels under the Source column for each group
# assign() returns a new frame rather than writing into a frame that was produced by
# filtering, which avoids SettingWithCopyWarning.
df_one_hit_wonders = df_one_hit_wonders.assign(Source='One Hit Wonders')
df_artists_with_staying_power = df_artists_with_staying_power.assign(Source='Artists with Staying Power')

#Group the data of each group by year, and apply our function
top_10_one_hit_wonders_yearly = (
    df_one_hit_wonders.groupby('chart_year')
    .apply(get_top_10_per_year)
    .reset_index(drop=True)
)
top_10_staying_power_yearly = (
    df_artists_with_staying_power.groupby('chart_year')
    .apply(get_top_10_per_year)
    .reset_index(drop=True)
)

#Combine the two data sets into one
# ignore_index=True gives the combined frame a unique index; both inputs start at 0,
# so a plain concat would repeat index labels.
top_10_combined_yearly = pd.concat(
    [top_10_one_hit_wonders_yearly, top_10_staying_power_yearly],
    ignore_index=True,
)

#Clean up column names accordingly
top_10_combined_yearly = top_10_combined_yearly.rename(
    columns={'time_on_chart': 'Time on Chart', 'avg_chart_position': 'Average Chart Position', 'song': 'Song'}
)
In [6]:
#Store the years and sources in variables
# unique() returns values in order of first appearance. The combined frame lists all
# one-hit-wonder rows before the staying-power rows, so a year that only occurs in the
# second group would come after every other year. Sorting keeps anything that iterates
# over all_years in chronological order.
all_years = np.sort(top_10_combined_yearly['chart_year'].unique())
all_sources = ["One Hit Wonders", "Artists with Staying Power"]
In [7]:
#Create Dataframe that has a row for every year-source combination to prepare it for data visualization.
# Combinations with no real rows get a single placeholder record whose plotted values
# are NaN, so every (year, source) pair is represented in the result.
year_column = top_10_combined_yearly['chart_year']
source_column = top_10_combined_yearly['Source']

expanded_data = []
for year in all_years:
    rows_in_year = year_column == year
    for Source in all_sources:
        matching_rows = top_10_combined_yearly[rows_in_year & (source_column == Source)]

        # Real data available: carry the rows over unchanged.
        if not matching_rows.empty:
            expanded_data.extend(matching_rows.to_dict('records'))
            continue

        # Nothing for this pair: add the placeholder record.
        placeholder = {
            'chart_year': year,
            'Source': Source,
            'Average Chart Position': np.nan,
            'Time on Chart': np.nan,
            'individual_artist': f'Placeholder {Source} {year}'
        }
        expanded_data.append(placeholder)

expanded_df = pd.DataFrame(expanded_data)
expanded_df.head()
Out[7]:
chart_position chart_date Song performer song_id instance Time on Chart consecutive_weeks previous_week peak_position worst_position chart_debut chart_url chart_year individual_artist Average Chart Position Source
0 1.0 1958-11-29 To Know Him, Is To Love Him The Teddy Bears To Know Him, Is To Love HimThe Teddy Bears 1.0 11.0 10.0 3.0 1.0 88.0 1958-09-20 https://www.billboard.com/charts/hot-100/1958-... 1958 The Teddy Bears 24.0 One Hit Wonders
1 1.0 1958-11-08 It's Only Make Believe Conway Twitty It's Only Make BelieveConway Twitty 1.0 9.0 8.0 2.0 1.0 65.0 1958-09-13 https://www.billboard.com/charts/hot-100/1958-... 1958 Conway Twitty 22.0 One Hit Wonders
2 1.0 1958-11-15 Tom Dooley The Kingston Trio Tom DooleyThe Kingston Trio 1.0 8.0 7.0 2.0 1.0 83.0 1958-09-27 https://www.billboard.com/charts/hot-100/1958-... 1958 The Kingston Trio 19.0 One Hit Wonders
3 1.0 1958-09-27 It's All In The Game Tommy Edwards It's All In The GameTommy Edwards 1.0 7.0 6.0 3.0 1.0 96.0 1958-08-16 https://www.billboard.com/charts/hot-100/1958-... 1958 Tommy Edwards 19.0 One Hit Wonders
4 1.0 1958-08-23 Little Star The Elegants Little StarThe Elegants 1.0 4.0 3.0 2.0 1.0 18.0 1958-08-02 https://www.billboard.com/charts/hot-100/1958-... 1958 The Elegants 18.0 One Hit Wonders
In [8]:
#Visualize data!

# Axis extents, computed once and shared by the plot ranges and the quadrant guide lines.
worst_avg_position = top_10_combined_yearly['Average Chart Position'].max()
best_avg_position = top_10_combined_yearly['Average Chart Position'].min()
longest_time_on_chart = top_10_combined_yearly['Time on Chart'].max()

fig = px.scatter(
    expanded_df,
    x="Average Chart Position",
    y="Time on Chart",
    animation_frame="chart_year",
    animation_group="individual_artist",
    hover_name="individual_artist",
    hover_data={"Song": True, "Source": False, "chart_year": False},
    color="Source",
    # The x range runs from the largest position number down to the smallest, so the
    # axis is reversed and smaller position numbers sit on the right-hand side.
    range_x=[worst_avg_position, best_avg_position],
    range_y=[0, longest_time_on_chart]
)

# Where the quadrant guide lines cross.
x_mid = 30
y_mid = 30

guide_line_style = dict(color="Black", dash="dash", width=0.5)

# Vertical guide at x_mid spanning the full y range.
fig.add_shape(
    type="line",
    x0=x_mid,
    x1=x_mid,
    y0=0,
    y1=longest_time_on_chart,
    line=guide_line_style
)

# Horizontal guide at y_mid spanning the full x range.
fig.add_shape(
    type="line",
    x0=worst_avg_position,
    x1=best_avg_position,
    y0=y_mid,
    y1=y_mid,
    line=guide_line_style
)

# Attribution in the top-right corner of the plotting area (paper coordinates).
# Added once only; a second identical annotation would just be drawn on top of it.
fig.add_annotation(
    text="Letty Uy - CIS 9655",
    xref="paper",
    yref="paper",
    x=1,
    y=1,
    showarrow=False,
    font=dict(
        size=12,
        color="black"
    )
)

# One label per quadrant, placed half of x_mid / y_mid away from the crossing point.
quadrant_labels = [
    ("Lower average rank, long duration", x_mid + (x_mid / 2), y_mid + (y_mid / 2)),
    ("Higher average rank, long duration", x_mid - (x_mid / 2), y_mid + (y_mid / 2)),
    ("Higher average rank, short duration", x_mid - (x_mid / 2), y_mid - (y_mid / 2)),
    ("Lower average rank, short duration", x_mid + (x_mid / 2), y_mid - (y_mid / 2)),
]
for label_text, label_x, label_y in quadrant_labels:
    fig.add_annotation(
        text=label_text,
        x=label_x,
        y=label_y,
        showarrow=False
    )

# The y axis plots the "Time on Chart" column, so the title names that quantity.
fig.update_layout(
    xaxis_title="Average Chart Position",
    yaxis_title="Time on Chart (weeks)"
)

fig.show()